library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(psych)
library(limma)
library(tidyverse)
library(preprocessCore)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/
library(NOMAD)  # devtools::install_github("carlmurie/NOMAD")
library(vsn)

This notebook presents an isobaric labeling data analysis strategy that includes data-driven normalization.

We will check how varying analysis components [summarization/normalization/differential abundance testing methods] changes end results of a quantitative proteomic study.

source('other_functions.R')
source('plotting_functions.R')

# you should either make a symbolic link in this directory
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# dat.w <- data.list$dat.w # data in wide format
if ('X' %in% colnames(dat.l)) { dat.l$X <- NULL }

# remove shared peptides
# BUG FIX: the filtered result used to be assigned to an unused variable
# `shared.peptides`, so shared peptides were never actually removed from dat.l.
dat.l <- dat.l %>% filter(!shared.peptide)

# keep spectra with isolation interference <30 and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# which proteins were spiked in? (UPS accessions contain 'ups')
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character

# which peptides were identified in each MS run?
# (use `<-` for assignment instead of `=`, per R style convention)
unique.pep <- dat.l %>%
  group_by(Run) %>%
  distinct(Peptide) %>%
  mutate(val = 1)
# cross-tabulate: one row per peptide, one column per run, cell = # identifications
unique.pep <- xtabs(val ~ Peptide + Run, data = unique.pep)
# keep peptides identified exactly once in every run (removes the unused temp variable)
inner.peptides <- rownames(unique.pep)[apply(unique.pep, 1, function(x) all(x == 1))]
# specify # of varying component variants and their names
variant.names <- c('quantile', 'CONSTANd', 'NOMAD', 'medianSweeping', 'CycLoessVSN')
n.comp.variants <- length(variant.names)
# all variants operate on the (additive) log scale
scale.vec <- rep('log', n.comp.variants)
# pick reference channel and condition for making plots / doing DEA
referenceChannel <- '127C'
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.colour <- tribble(
  ~Condition, ~Colour,
  '0.125', 'black',
  '0.5',   'blue',
  '0.667', 'green',
  '1',     'red')
# create data frame with sample info (distinct Run, Channel, Sample, Condition, Colour)
sample.info <- get_sample_info(dat.l, condition.colour)
channelNames <- remove_factors(unique(sample.info$Channel))

1 Unit component: log2 transformation of reporter ion intensities

# switch to the log2 scale: the log2 intensity becomes the response variable
dat.unit.l <- dat.l %>%
  mutate(response = log2(Intensity)) %>%
  select(-Intensity)

2 Normalization component

# switch to wide format: one column per quantification channel.
# all_of() replaces the superseded one_of(); it errors loudly if the
# Condition/BioReplicate columns were missing instead of silently warning.
dat.unit.w <- pivot_wider(data = dat.unit.l, id_cols=-all_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response)
# pre-allocate one slot per normalization variant
dat.norm.w <- emptyList(variant.names)

2.1 Quantile (1)

We apply quantile normalization to each Run separately, and then re-scale the observations so that the mean observation within each run is set equal to the mean observation across all runs.

# target level after re-scaling: the grand mean over all runs and channels
grand_average <- mean(as.matrix(dat.unit.w[, channelNames]))
x.split <- split(dat.unit.w, dat.unit.w$Run)
x.split.norm <- lapply(x.split, function(run.dat) {
  # quantile-normalize each Run separately
  run.dat[, channelNames] <- normalize.quantiles(as.matrix(run.dat[, channelNames]))
  # re-scale so that the average of this run equals the grand average
  run.dat[, channelNames] <- run.dat[, channelNames] / mean(colMeans(run.dat[, channelNames])) * grand_average
  return(run.dat)
})
dat.norm.w$quantile <- bind_rows(x.split.norm)

2.2 CONSTANd

# dat.unit.l entries are in long format so all have same colnames and no channelNames
# apply CONSTANd to each Run separately
x.split <- split(dat.unit.w, dat.unit.w$Run)
x.split.norm <- lapply(x.split, function(run.dat) {
  # CONSTANd returns a list; its $normalized_data element holds the normalized matrix
  run.dat[, channelNames] <- CONSTANd(run.dat[, channelNames])$normalized_data
  return(run.dat)
})
dat.norm.w$CONSTANd <- bind_rows(x.split.norm)

2.3 NOMAD

We apply NOMAD on the PSM level instead of the peptide level.

# doRobust=F: use means, like CONSTANd; doLog=F: values are already transformed.
# NOMAD expects the channel column to be named 'iTRAQ', hence the rename.
dat.nomadnorm <- nomadNormalization(dat.unit.l$response, dat.unit.l %>% rename(iTRAQ=Channel) %>% as.data.frame, doRobust = FALSE, doiTRAQCorrection = FALSE, doLog = FALSE)
## Running normalization with  464224  number of data points
## Normalizing for factor:  Peptide 
## Normalizing for factor:  Run 
## Normalizing for factor:  iTRAQ 
## Normalizing for factor:  Run Peptide 
## Normalizing for factor:  Run iTRAQ
# write the normalized responses ($y) back into the annotation table ($x)
dat.nomadnorm$x$response <- dat.nomadnorm$y
# all_of() replaces the superseded one_of() from tidyselect
dat.norm.w$NOMAD <- pivot_wider(data = dat.nomadnorm$x, id_cols=-all_of(c('Condition', 'BioReplicate')), names_from=iTRAQ, values_from=response)

2.4 medianSweeping (1)

# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w$medianSweeping <- dat.unit.w
# per-spectrum (row-wise) median across channels
spectrum.medians <- apply(dat.norm.w$medianSweeping[, channelNames], 1, median)
dat.norm.w$medianSweeping[, channelNames] <- sweep(dat.norm.w$medianSweeping[, channelNames], 1, spectrum.medians)

2.5 Cyclic loess + VSN normalization (1)

# cyclic loess within each Run, followed by VSN across all runs jointly
x.split <- split(dat.unit.w, dat.unit.w$Run)
x.split.norm <- lapply(x.split, function(run.dat) {
  run.dat[, channelNames] <- normalizeCyclicLoess(run.dat[, channelNames], span = 0.005, iterations = 15)
  return(run.dat)
})
dat.norm.w$CycLoessVSN <- bind_rows(x.split.norm)
# vsn2 is fitted with one stratum per Run; the hx slot holds the transformed values
vsn.tmp <- vsn2(as.matrix(dat.norm.w$CycLoessVSN[, channelNames]), strata = dat.norm.w$CycLoessVSN$Run)
dat.norm.w$CycLoessVSN[, channelNames] <- vsn.tmp@hx

3 Summarization component

Summarize quantification values from PSM to peptide (first step) to protein (second step).

3.1 Median summarization (PSM to peptide to protein)

# normalized data
dat.norm.summ.w <- lapply(dat.norm.w, function(x) {
  # two-step median summarization: PSM -> peptide, then peptide -> protein.
  # across(all_of(...)) replaces the superseded summarise_at()/select()
  # combination; summarise() drops the non-grouping character columns
  # automatically, so the explicit select() calls are no longer needed.
  x %>%
    group_by(Run, Protein, Peptide) %>%
    summarise(across(all_of(channelNames), median)) %>%  # PSM -> peptide (still grouped by Run, Protein)
    summarise(across(all_of(channelNames), median)) %>%  # peptide -> protein
    ungroup()
})

Let’s also summarize the non-normalized data for use in the comparison section.

# non-normalized data, summarized the same way (PSM -> peptide -> protein medians).
# across(all_of(...)) replaces the superseded summarise_at()/select() combination;
# summarise() drops non-grouping character columns automatically.
dat.nonnorm.summ.w <- dat.unit.w %>%
  group_by(Run, Protein, Peptide) %>%
  summarise(across(all_of(channelNames), median)) %>%
  summarise(across(all_of(channelNames), median)) %>%
  ungroup()
# make data completely wide (also across runs)
## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  # one column per Run:Channel combination
  return(x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}"))
})
# BUG FIX: colnames() on a list is always NULL (as the rendered output showed);
# names() shows the variant names, which is what was intended here.
names(dat.norm.summ.w2)
## NULL

4 Normalization component (2)

4.1 Quantile (2)

# apply quantile normalization again, now to the data from all runs simultaneously
quant.mat <- as.matrix(dat.norm.summ.w2$quantile[, sample.info$Sample])
dat.norm.summ.w2$quantile[, sample.info$Sample] <- normalize.quantiles(quant.mat)
dat.norm.summ.w2
## $quantile
## # A tibble: 4,083 x 33
##    Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
##    <fct>              <dbl>            <dbl>            <dbl>            <dbl>
##  1 A0AVT1              14.2             14.7             15.0             14.0
##  2 A0FGR8              14.2             14.8             15.2             14.4
##  3 A0MZ66              14.9             14.7             13.9             14.5
##  4 A1L0T0              14.3             13.6             NA               14.2
##  5 A3KMH1              11.0             11.1             10.5             NA  
##  6 A4D1E9              15.2             15.0             NA               14.4
##  7 A5YKK6              14.6             NA               NA               14.9
##  8 A6NDG6              10.3             12.2             10.4             15.5
##  9 A6NDU8              13.5             13.8             13.4             13.0
## 10 A6NHQ2              16.6             14.7             NA               NA  
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## #   `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## #   `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## #   `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## #   `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## #   `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## #   `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## #   `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## #   `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## #   `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
## 
## $CONSTANd
## # A tibble: 4,083 x 33
##    Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
##    <fct>              <dbl>            <dbl>            <dbl>            <dbl>
##  1 A0AVT1             0.999            0.991            1.00             1.00 
##  2 A0FGR8             0.993            1.00             1.00             0.999
##  3 A0MZ66             0.997            1.01             1.00             0.997
##  4 A1L0T0             1.01             1.00            NA                1.01 
##  5 A3KMH1             1.05             1.03             0.999           NA    
##  6 A4D1E9             1.01             0.998           NA                1.00 
##  7 A5YKK6             0.995           NA               NA                0.991
##  8 A6NDG6             0.999            0.993            0.995            0.998
##  9 A6NDU8             1.00             0.997            1.01             1.01 
## 10 A6NHQ2             0.995            0.982           NA               NA    
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## #   `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## #   `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## #   `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## #   `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## #   `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## #   `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## #   `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## #   `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## #   `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
## 
## $NOMAD
## # A tibble: 4,083 x 33
##    Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
##    <fct>              <dbl>            <dbl>            <dbl>            <dbl>
##  1 A0AVT1          -0.00791          -0.125           0.00800           0.0331
##  2 A0FGR8          -0.104             0.0339         -0.00415          -0.0203
##  3 A0MZ66          -0.0427            0.120           0.00582          -0.0431
##  4 A1L0T0           0.0953            0.0311         NA                 0.152 
##  5 A3KMH1           0.549             0.335           0.00572          NA     
##  6 A4D1E9           0.187            -0.0381         NA                 0.0706
##  7 A5YKK6          -0.0721           NA              NA                -0.145 
##  8 A6NDG6          -0.00381          -0.0785         -0.0302           -0.0421
##  9 A6NDU8           0.0250           -0.0416          0.166             0.136 
## 10 A6NHQ2          -0.0905           -0.279          NA                NA     
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## #   `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## #   `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## #   `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## #   `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## #   `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## #   `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## #   `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## #   `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## #   `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
## 
## $medianSweeping
## # A tibble: 4,083 x 33
##    Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
##    <fct>              <dbl>            <dbl>            <dbl>            <dbl>
##  1 A0AVT1          -0.0215          -0.140            -0.0269          0.00507
##  2 A0FGR8          -0.130            0.0122           -0.0618         -0.0824 
##  3 A0MZ66          -0.0429           0.0990           -0.0530         -0.102  
##  4 A1L0T0           0.0111           0.00937          NA               0.0561 
##  5 A3KMH1           0.563            0.216            -0.0436         NA      
##  6 A4D1E9           0.169           -0.0629           NA               0.00864
##  7 A5YKK6          -0.0737          NA                NA              -0.239  
##  8 A6NDG6          -0.0733          -0.0701           -0.129          -0.0980 
##  9 A6NDU8          -0.00860         -0.105             0.0997          0.0165 
## 10 A6NHQ2          -0.120           -0.208            NA              NA      
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## #   `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## #   `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## #   `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## #   `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## #   `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## #   `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## #   `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## #   `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## #   `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
## 
## $CycLoessVSN
## # A tibble: 4,083 x 33
##    Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
##    <fct>              <dbl>            <dbl>            <dbl>            <dbl>
##  1 A0AVT1              9.92             9.81             9.90             10.1
##  2 A0FGR8              9.92             9.81             9.90             10.1
##  3 A0MZ66              9.92             9.81             9.90             10.1
##  4 A1L0T0              9.92             9.81            NA                10.1
##  5 A3KMH1              9.92             9.81             9.89             NA  
##  6 A4D1E9              9.92             9.81            NA                10.1
##  7 A5YKK6              9.92            NA               NA                10.1
##  8 A6NDG6              9.92             9.81             9.89             10.1
##  9 A6NDU8              9.92             9.81             9.90             10.1
## 10 A6NHQ2              9.93             9.81            NA                NA  
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## #   `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## #   `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## #   `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## #   `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## #   `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## #   `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## #   `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## #   `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## #   `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>

4.2 MedianSweeping (2)

# medianSweeping: in each channel, subtract median computed across all proteins within the channel, separately for each MS run.
x.split <- split(dat.norm.summ.w$medianSweeping, dat.norm.summ.w$medianSweeping$Run)
x.split.norm <- lapply(x.split, function(run.dat) {
  # per-channel (column-wise) median within this run
  channel.medians <- apply(run.dat[, channelNames], 2, median)
  run.dat[, channelNames] <- sweep(run.dat[, channelNames], 2, channel.medians)
  return(run.dat)
})
dat.norm.summ.w$medianSweeping <- bind_rows(x.split.norm)

4.3 VSN (2)

# additional VSN on protein level data applied to all runs
sample.cols <- setdiff(colnames(dat.norm.summ.w2$CycLoessVSN), 'Protein')
vsn.tmp <- vsn2(as.matrix(dat.norm.summ.w2$CycLoessVSN %>% select(-Protein)))
dat.norm.summ.w2$CycLoessVSN[, sample.cols] <- vsn.tmp@hx

5 QC plots

# make data completely wide (also across runs)

## non-normalized data: one column per Run:Channel combination
dat.nonnorm.summ.w2 <- dat.nonnorm.summ.w %>%
  pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}")

## normalized data
# we already did this earlier, right before the second stage of quantile normalization!
# NOTE(review): this reassignment rebuilds dat.norm.summ.w2 from dat.norm.summ.w,
# which appears to discard the second-stage quantile normalization (section 4.1)
# and the protein-level VSN (section 4.3) that were applied to dat.norm.summ.w2
# directly -- confirm this is intended. The medianSweeping second stage survives
# because it was applied to dat.norm.summ.w itself (section 4.2).
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  # pivot fully wide with names_sep, then flip_colnames — presumably to get the
  # same "Run:Channel" column naming as names_glue produced earlier; verify
  # against its definition in other_functions.R / plotting_functions.R
  dat.tmp <- x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_sep = ":")
  dat.tmp <- flip_colnames(dat.tmp, 'Protein')
  return(dat.tmp)
})

5.1 Boxplots

# use (half-)wide format
par(mfrow = c(3, 2))
# raw data first, then one boxplot per normalization variant
boxplot_w(dat.nonnorm.summ.w, sample.info, 'Raw')
for (i in seq_len(n.comp.variants)) {
  boxplot_w(dat.norm.summ.w[[i]], sample.info, paste('Normalized', variant.names[i], sep = '_'))
}

par(mfrow = c(1, 1))

5.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

par(mfrow=c(3,2))
# different unit variants require different computation of fold changes and average abundance: additive or multiplicative scale; see maplot_ils function 
# use wide2 format
p <- vector('list', n.comp.variants+1)
# BUG FIX: this call previously used scale.vec[i] with a stale loop index left
# over from an earlier loop; use scale.vec[1] explicitly for the raw data
# (all variants are on the log scale here, as is the raw log2 data).
p[[1]] <- maplot_ils(dat.nonnorm.summ.w2, 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[1], 'Raw')
for (i in seq_len(n.comp.variants)){
 p[[i+1]]<- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(grobs = p, ncol=2, nrow=3)

par(mfrow=c(1, 1))

MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

par(mfrow=c(3,2))
# different unit variants require different computation of fold changes and average abundance: additive or multiplicative scale; see maplot_ils function 
channels.num <- sample.info %>% filter(Condition=='1') %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% select(Sample) %>% pull

p <- vector('list', n.comp.variants+1)
# BUG FIX: scale.vec[i] relied on a stale loop index from an earlier loop;
# use scale.vec[1] explicitly for the raw (log2-scale) data.
p[[1]] <- maplot_ils(dat.nonnorm.summ.w2, channels.num, channels.denom, scale.vec[1], 'Raw')

for (i in seq_len(n.comp.variants)){
 p[[i+1]]<- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom, scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(grobs = p, ncol=2, nrow=3)

par(mfrow=c(1, 1))

5.3 CV (coefficient of variation) plots

par(mfrow=c(3,2))
# derive the Mixture from the Run name (the part before the first '_'),
# then melt back to long format for the CV computation
dat.nonnorm.summ.l <- lapply(list(dat.nonnorm.summ.w), function(x) {
  x$Mixture <- vapply(stri_split(x$Run, fixed = '_'), function(parts) parts[1], character(1))
  to_long_format(x, sample.info)
})

dat.norm.summ.l <- lapply(dat.norm.summ.w, function(x) {
  x$Mixture <- vapply(stri_split(x$Run, fixed = '_'), function(parts) parts[1], character(1))
  to_long_format(x, sample.info)
})

par(mfrow=c(2, 2))
# CV plot for the raw data, then one per normalization variant
cvplot_ils(dat = dat.nonnorm.summ.l[[1]], feature.group = 'Protein', xaxis.group = 'Condition',
           title = 'Raw', abs = FALSE)

for (i in seq_len(n.comp.variants)) {
  cvplot_ils(dat = dat.norm.summ.l[[i]], feature.group = 'Protein', xaxis.group = 'Condition',
             title = paste('Normalized', variant.names[i], sep = '_'), abs = FALSE)
}

par(mfrow=c(1, 1))  

5.4 PCA plots

5.4.1 Using all proteins

# PCA on ALL proteins; drop the Protein identifier column so only the
# numeric sample columns enter the PCA
par(mfrow=c(3,2))
  pcaplot_ils(dat.nonnorm.summ.w2 %>% select(-'Protein'), info=sample.info, 'Raw')
  
for (i in seq_along(dat.norm.summ.w2)){
  # one PCA plot per normalization variant (all proteins; the spiked-only
  # version follows in the next section)
  pcaplot_ils(dat.norm.summ.w2[[i]] %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))  

5.4.2 Using spiked proteins only

# PCA restricted to the spiked-in (UPS) proteins only
par(mfrow=c(3,2))
pcaplot_ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, 'Raw')

for (i in seq_along(dat.norm.summ.w2)) {
  # select only the spiked.proteins
  pcaplot_ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))

5.5 HC (hierarchical clustering) plots

Only use spiked proteins

par(mfrow=c(3,2))
# hierarchical clustering of samples, spiked-in proteins only
dendrogram_ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, 'Raw')

for (variant in seq_along(dat.norm.summ.w2)) {
  dendrogram_ils(dat.norm.summ.w2[[variant]] %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), info=sample.info, paste('Normalized', variant.names[variant], sep='_'))
}

par(mfrow=c(1, 1))

6 DEA component: Moderated t-test

TODO: - Also try to log-transform the intensity case, to see if there are large differences in the t-test results. - done. remove this code? NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in seq_along(dat.norm.summ.w2)) {
  # look up the measurement scale belonging to this variant
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  d <- as.data.frame(dat.norm.summ.w2[[i]]) %>% column_to_rownames('Protein')
  dat.dea[[i]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}
# also see what the unnormalized results would look like
n.comp.variants <- n.comp.variants + 1
variant.names <- c(variant.names, 'raw')
scale.vec <- c(scale.vec, 'raw')
dat.dea$raw <- moderated_ttest(dat=column_to_rownames(dat.nonnorm.summ.w2, 'Protein'), design.matrix, scale='raw')

7 Results comparison

7.1 Confusion matrix

# build confusion matrices at the q.mod < 0.05 cutoff, treating the
# spiked-in proteins as the true positives, then print them
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm)
0.125
quantile
CONSTANd
NOMAD
medianSweeping
CycLoessVSN
raw
background spiked background spiked background spiked background spiked background spiked background spiked
not_DE 4064 15 4061 5 4062 4 4061 4 4064 19 4064 15
DE 0 4 3 14 2 15 3 15 0 0 0 4
0.125
quantile CONSTANd NOMAD medianSweeping CycLoessVSN raw
Accuracy 0.996 0.998 0.999 0.998 0.995 0.996
Sensitivity 0.211 0.737 0.789 0.789 0.000 0.211
Specificity 1.000 0.999 1.000 0.999 1.000 1.000
PPV 1.000 0.824 0.882 0.833 NaN 1.000
NPV 0.996 0.999 0.999 0.999 0.995 0.996
0.667
quantile
CONSTANd
NOMAD
medianSweeping
CycLoessVSN
raw
background spiked background spiked background spiked background spiked background spiked background spiked
not_DE 4064 19 4064 15 4064 17 4062 17 4064 19 4064 19
DE 0 0 0 4 0 2 2 2 0 0 0 0
0.667
quantile CONSTANd NOMAD medianSweeping CycLoessVSN raw
Accuracy 0.995 0.996 0.996 0.995 0.995 0.995
Sensitivity 0.000 0.211 0.105 0.105 0.000 0.000
Specificity 1.000 1.000 1.000 1.000 1.000 1.000
PPV NaN 1.000 1.000 0.500 NaN NaN
NPV 0.995 0.996 0.996 0.996 0.995 0.995
1
quantile
CONSTANd
NOMAD
medianSweeping
CycLoessVSN
raw
background spiked background spiked background spiked background spiked background spiked background spiked
not_DE 4064 17 4059 5 4062 5 4060 5 4064 19 4064 17
DE 0 2 5 14 2 14 4 14 0 0 0 2
1
quantile CONSTANd NOMAD medianSweeping CycLoessVSN raw
Accuracy 0.996 0.998 0.998 0.998 0.995 0.996
Sensitivity 0.105 0.737 0.737 0.737 0.000 0.105
Specificity 1.000 0.999 1.000 0.999 1.000 1.000
PPV 1.000 0.737 0.875 0.778 NaN 1.000
NPV 0.996 0.999 0.999 0.999 0.995 0.996

7.2 Scatter plots

TO DO: Piotr: constant NOMAD and RAW q-values (approx. 1) generate an error in the scatter plots

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- grep('logFC', dea.cols, fixed = TRUE, value = TRUE)
q.cols <- grep('q.mod', dea.cols, fixed = TRUE, value = TRUE)
n.contrasts <- length(logFC.cols)

#scatterplot_ils(dat.dea, q.cols, 'p-values') # commented due to error, sd=0 for NOMAD and RAW
scatterplot_ils(dat.dea, logFC.cols, 'log2FC')

7.3 Volcano plots

# one volcano plot per contrast
for (contrast.idx in seq_len(n.contrasts)) {
  volcanoplot_ils(dat.dea, contrast.idx, spiked.proteins)
}

7.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per condition
# keep only the spiked proteins' logFC columns from each DEA result
dat.spiked.logfc <- lapply(dat.dea, function(x) x[spiked.proteins,logFC.cols])
dat.spiked.logfc.l <- lapply(dat.spiked.logfc, function(x) {
  # rename columns like "logFC_<condition>" to just "<condition>", then pivot to
  # long format; rep(rownames, each=ncol) restores the protein id per row, which
  # matches pivot_longer's row order (all columns of row 1, then of row 2, ...)
  x %>% rename_with(function(y) sapply(y, function(z) strsplit(z, '_')[[1]][2])) %>% pivot_longer(cols = everything(), names_to = 'condition', values_to = 'logFC') %>% add_column(Protein=rep(rownames(x), each=length(colnames(x)))) })
# drop the reference condition before plotting
violinplot_ils(lapply(dat.spiked.logfc.l, filter, condition != referenceCondition))

8 Conclusions

9 Session information

# record R and package versions for reproducibility
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_BE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_BE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] vsn_3.57.0            Biobase_2.49.1        BiocGenerics_0.35.4  
##  [4] NOMAD_0.99.0          dplR_1.7.1            CONSTANd_0.99.4      
##  [7] preprocessCore_1.51.0 forcats_0.5.0         stringr_1.4.0        
## [10] dplyr_1.0.2           purrr_0.3.4           readr_1.4.0          
## [13] tidyr_1.1.2           tibble_3.0.4          tidyverse_1.3.0      
## [16] limma_3.45.19         psych_2.0.9           kableExtra_1.3.1     
## [19] dendextend_1.14.0     gridExtra_2.3         stringi_1.5.3        
## [22] ggplot2_3.3.2        
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1     ellipsis_0.3.1       class_7.3-17        
##  [4] fs_1.5.0             rstudioapi_0.11      farver_2.0.3        
##  [7] affyio_1.59.0        prodlim_2019.11.13   fansi_0.4.1         
## [10] lubridate_1.7.9      xml2_1.3.2           codetools_0.2-16    
## [13] splines_4.0.3        R.methodsS3_1.8.1    mnormt_2.0.2        
## [16] knitr_1.30           jsonlite_1.7.1       pROC_1.16.2         
## [19] caret_6.0-86         broom_0.7.2          dbplyr_1.4.4        
## [22] png_0.1-7            R.oo_1.24.0          BiocManager_1.30.10 
## [25] compiler_4.0.3       httr_1.4.2           backports_1.1.10    
## [28] assertthat_0.2.1     Matrix_1.2-18        cli_2.1.0           
## [31] htmltools_0.5.0      tools_4.0.3          gtable_0.3.0        
## [34] glue_1.4.2           affy_1.67.1          reshape2_1.4.4      
## [37] Rcpp_1.0.5           cellranger_1.1.0     vctrs_0.3.4         
## [40] nlme_3.1-150         iterators_1.0.13     timeDate_3043.102   
## [43] xfun_0.18            gower_0.2.2          rvest_0.3.6         
## [46] lifecycle_0.2.0      XML_3.99-0.5         zlibbioc_1.35.0     
## [49] MASS_7.3-53          scales_1.1.1         ipred_0.9-9         
## [52] hms_0.5.3            yaml_2.2.1           rpart_4.1-15        
## [55] highr_0.8            foreach_1.5.1        e1071_1.7-4         
## [58] lava_1.6.8           rlang_0.4.8          pkgconfig_2.0.3     
## [61] matrixStats_0.57.0   evaluate_0.14        lattice_0.20-41     
## [64] recipes_0.1.14       labeling_0.4.2       tidyselect_1.1.0    
## [67] plyr_1.8.6           magrittr_1.5         R6_2.4.1            
## [70] generics_0.0.2       DBI_1.1.0            pillar_1.4.6        
## [73] haven_2.3.1          withr_2.3.0          mgcv_1.8-33         
## [76] survival_3.2-7       nnet_7.3-14          modelr_0.1.8        
## [79] crayon_1.3.4         utf8_1.1.4           tmvnsim_1.0-2       
## [82] rmarkdown_2.5        viridis_0.5.1        grid_4.0.3          
## [85] readxl_1.3.1         data.table_1.13.2    blob_1.2.1          
## [88] ModelMetrics_1.2.2.2 reprex_0.3.0         digest_0.6.27       
## [91] webshot_0.5.2        R.utils_2.10.1       signal_0.7-6        
## [94] stats4_4.0.3         munsell_0.5.0        viridisLite_0.3.0